In [ ]:
# Move up one directory — presumably to the project root so the relative
# 'regularisation_runs/' path used below resolves. TODO confirm on a fresh kernel.
cd ..

In [ ]:
import csv
import numpy as np
import operator
import re
import pandas as pd
import itertools

In [ ]:
import holoviews as hv
# Add markersize option for hv.Curve
#hv.plotting.mpl.CurvePlot.style_opts += [u'markersize']
# Mutate the matplotlib backend's option tree so Curve elements accept a
# 'markersize' style keyword (used in the %%opts plotting cells below).
options = hv.Store.options(backend='matplotlib')
options.Curve.groups['style'].allowed_keywords += ['markersize']
# Add holoviews notebook magic
hv.notebook_extension('matplotlib')

In [ ]:
# Subjects whose AUROC columns appear in the CSV; 'all' presumably aggregates
# across subjects — confirm against the CSV header.
subject_names = ('all', 'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2')

In [ ]:
# Path to the per-run AUC scores, relative to the working directory set above.
csvfnm = 'regularisation_runs/AUC_scores2.csv'

Load the CSV file with pandas.


In [ ]:
# Load the tab-separated scores table; first row is the header.
df1 = pd.read_csv(csvfnm, sep='\t', header=0)
# Drop any duplicated rows from repeated runs of the same thing
# NOTE(review): if the line below is re-enabled, remove either the assignment
# or inplace=True — `drop_duplicates(inplace=True)` returns None, so the
# assignment would clobber the DataFrame.
#df = df.drop_duplicates(inplace=True)
df1

We need to parse the RUN_NAME column to extract the features which couldn't go into the CSV file as their own columns.


In [ ]:
# We need to parse the RUN_NAME column to extract the features which couldn't go
# into the CSV file as their own columns.
extra_columns = df1.RUN_NAME.str.split('_').tolist()

# Splitting on underscores gets out all the fields except the regularisation
# entry, which is of the form 'name=value'. Split it on the equals sign and
# check that the field name is the same in every row.
field_name = None
for index, row in enumerate(extra_columns):
    this_field_name, param_value = row[1].split('=')
    if field_name is None:
        field_name = this_field_name
    elif field_name != this_field_name:
        # BUGFIX: was `field_name is not this_field_name` — identity comparison
        # on strings only works by interning accident; compare values instead.
        raise ValueError('fieldname mismatch')
    extra_columns[index][1] = float(param_value)

# The fifth field is the number of CV splits (see column names below); cast to int.
for index, row in enumerate(extra_columns):
    extra_columns[index][4] = int(row[4])

df2 = pd.DataFrame(extra_columns,
                   columns=['classifier_name', field_name, 'modtyp', 'featureset', 'n_splits'])

In [ ]:
# Sanity check: one parsed row per run in the CSV.
len(extra_columns)

In [ ]:
# Inspect the parsed fields (dumps the whole list — acceptable for a small run table).
extra_columns

In [ ]:
# Join the raw scores and the parsed RUN_NAME fields column-wise.
# BUGFIX: `join_axes=[df1.index]` was deprecated in pandas 0.25 and removed in
# 1.0; reindexing to df1's index afterwards gives the same aligned result.
df = pd.concat([df1, df2], axis=1).reindex(df1.index)
df

In [ ]:
%%opts Scatter plot[logx=True] Curve plot[logx=True]

fwible = [(
            (subject_name, clf_name, modtyp, featureset, n_splits),
            hv.Curve(df[
                        (df['classifier_name']==clf_name) &
                        (df['modtyp']==modtyp) &
                        (df['featureset']==featureset) &
                        (df['n_splits']==n_splits)
                     ][['C', subject_name]].sort_values(by='C').values, kdims=['C'], vdims=['AUROC'])
          )
          for (subject_name, clf_name, modtyp, featureset, n_splits) in itertools.product(
            subject_names,
            ['LR'],  # df['classifier_name'].unique(), ['LR', 'SVC']
            ['raw','ica'],  # df['modtyp'].unique(),
            df['featureset'].unique(),
            df['n_splits'].unique()
            )
         ]

In [ ]:
# Index every curve in a HoloMap keyed by the experiment parameters, so the
# overlay/layout cells below can slice along any key dimension.
hvm = hv.HoloMap(fwible, kdims=('subject_name', 'classifier_name', 'modtyp', 'featureset', 'n_splits'))
hvm

In [ ]:
%%output size=200
%%opts Curve (marker='o', markersize=10, color=Palette('Set1'))
%%opts NdOverlay [legend_position='bottom_left']
# All subjects' curves overlaid on shared axes, per remaining key combination.
hvm.overlay('subject_name')

In [ ]:
%%output size=120
# Overlay classifiers on shared axes; one row of the layout per modtyp.
hvm.overlay('classifier_name').layout('modtyp').cols(1)

In [ ]:
%%output size=120
# Overlay the different n_splits values; one row of the layout per modtyp.
hvm.overlay('n_splits').layout('modtyp').cols(1)

In [ ]:
%%output size=200
# Overlay raw vs ica preprocessing; one row of the layout per classifier.
hvm.overlay('modtyp').layout('classifier_name').cols(1)

In [ ]:
%%output size=200
# Overlay every featureset on shared axes.
hvm.overlay('featureset')

In [ ]:
%%output size=120
# Overlay n_splits and modtyp together; two-column layout, one panel per featureset.
hvm.overlay(['n_splits', 'modtyp']).layout('featureset').cols(2)